{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# dataset path = E:\\Mycode\\UM_MDSAIA_Assignmentss\\CISC7201\\final_project\\processed_lianjia_data_filtered.csv\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.neural_network import MLPRegressor\n",
    "import pandas as pd\n",
    "\n",
    "# Define features and target variable\n",
    "df = pd.read_csv('E:\\\\Mycode\\\\UM_MDSAIA_Assignmentss\\\\CISC7201\\\\final_project\\\\processed_lianjia_data_filtered.csv')\n",
    "X = df.drop(columns=['price_per_sqm'])\n",
    "y = df['price_per_sqm']\n",
    "\n",
    "# Split the data into training and testing sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using device: cpu\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "# 检查是否有可用的GPU\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "print(f'Using device: {device}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Decision Tree Regressor MSE for price_per_sqm: 3014607.752442997\n",
      "MLP Regressor MSE for price_per_sqm: 373817.82477807254\n"
     ]
    }
   ],
   "source": [
    "# Define features and target variable for price_per_sqm prediction\n",
    "X_price = df.drop(columns=['price_per_sqm'])\n",
    "y_price = df['price_per_sqm']\n",
    "\n",
    "# Split the data into training and testing sets\n",
    "X_train_price, X_test_price, y_train_price, y_test_price = train_test_split(X_price, y_price, test_size=0.2, random_state=42)\n",
    "\n",
    "# Initialize the models\n",
    "dt_regressor_price = DecisionTreeRegressor(random_state=42)\n",
    "mlp_regressor_price = MLPRegressor(random_state=42, max_iter=1000)\n",
    "\n",
    "# Train the Decision Tree Regressor\n",
    "dt_regressor_price.fit(X_train_price, y_train_price)\n",
    "y_pred_dt_price = dt_regressor_price.predict(X_test_price)\n",
    "mse_dt_price = mean_squared_error(y_test_price, y_pred_dt_price)\n",
    "print(f'Decision Tree Regressor MSE for price_per_sqm: {mse_dt_price}')\n",
    "\n",
    "# Train the MLP Regressor\n",
    "mlp_regressor_price.fit(X_train_price, y_train_price)\n",
    "y_pred_mlp_price = mlp_regressor_price.predict(X_test_price)\n",
    "mse_mlp_price = mean_squared_error(y_test_price, y_pred_mlp_price)\n",
    "print(f'MLP Regressor MSE for price_per_sqm: {mse_mlp_price}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pure MLP Regressor MSE: 546370496.0\n",
      "MLP Regressor MSE with PCA: 1530772864.0\n",
      "Random Forest Regressor MSE with PCA: 210430252.7343058\n",
      "Pure Random Forest Regressor MSE: 6927773.97039163\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import numpy as np\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "\n",
    "# Define the MLP model\n",
    "class MLP(nn.Module):\n",
    "    def __init__(self, input_size):\n",
    "        super(MLP, self).__init__()\n",
    "        self.fc1 = nn.Linear(input_size, 128)\n",
    "        self.fc2 = nn.Linear(128, 64)\n",
    "        self.fc3 = nn.Linear(64, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = torch.relu(self.fc1(x))\n",
    "        x = torch.relu(self.fc2(x))\n",
    "        x = self.fc3(x)\n",
    "        return x\n",
    "\n",
    "# Convert data to PyTorch tensors\n",
    "X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)\n",
    "y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)\n",
    "X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)\n",
    "y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)\n",
    "\n",
    "# Initialize the MLP model, loss function, and optimizer\n",
    "mlp_model = MLP(X_train.shape[1])\n",
    "criterion = nn.MSELoss()\n",
    "optimizer = optim.Adam(mlp_model.parameters(), lr=0.001)\n",
    "\n",
    "# Train the MLP model\n",
    "num_epochs = 1000\n",
    "for epoch in range(num_epochs):\n",
    "    mlp_model.train()\n",
    "    optimizer.zero_grad()\n",
    "    outputs = mlp_model(X_train_tensor)\n",
    "    loss = criterion(outputs, y_train_tensor)\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "\n",
    "# Evaluate the MLP model\n",
    "mlp_model.eval()\n",
    "with torch.no_grad():\n",
    "    y_pred_mlp_tensor = mlp_model(X_test_tensor)\n",
    "    mse_mlp = criterion(y_pred_mlp_tensor, y_test_tensor).item()\n",
    "print(f'Pure MLP Regressor MSE: {mse_mlp}')\n",
    "\n",
    "# Basic implementation of Random Forest\n",
    "class RandomForest:\n",
    "    def __init__(self, n_estimators=100, max_depth=None):\n",
    "        self.n_estimators = n_estimators\n",
    "        self.max_depth = max_depth\n",
    "        self.trees = [DecisionTreeRegressor(max_depth=self.max_depth) for _ in range(self.n_estimators)]\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        for tree in self.trees:\n",
    "            bootstrap_indices = np.random.choice(np.arange(X.shape[0]), size=X.shape[0], replace=True)\n",
    "            X_bootstrap = X[bootstrap_indices]\n",
    "            y_bootstrap = y[bootstrap_indices]\n",
    "            tree.fit(X_bootstrap, y_bootstrap)\n",
    "\n",
    "    def predict(self, X):\n",
    "        tree_preds = np.array([tree.predict(X) for tree in self.trees])\n",
    "        return np.mean(tree_preds, axis=0)\n",
    "\n",
    "# Initialize and train the Random Forest model\n",
    "rf_model = RandomForest(n_estimators=100, max_depth=10)\n",
    "rf_model.fit(X_train.values, y_train.values)\n",
    "# Apply PCA to the dataset\n",
    "pca = PCA(n_components=0.95)  # Retain 95% of variance\n",
    "X_train_pca = pca.fit_transform(X_train)\n",
    "X_test_pca = pca.transform(X_test)\n",
    "\n",
    "# Convert PCA-transformed data to PyTorch tensors\n",
    "X_train_pca_tensor = torch.tensor(X_train_pca, dtype=torch.float32)\n",
    "X_test_pca_tensor = torch.tensor(X_test_pca, dtype=torch.float32)\n",
    "\n",
    "# Initialize and train the MLP model on PCA-transformed data\n",
    "mlp_model_pca = MLP(X_train_pca.shape[1])\n",
    "optimizer_pca = optim.Adam(mlp_model_pca.parameters(), lr=0.001)\n",
    "\n",
    "for epoch in range(num_epochs):\n",
    "    mlp_model_pca.train()\n",
    "    optimizer_pca.zero_grad()\n",
    "    outputs_pca = mlp_model_pca(X_train_pca_tensor)\n",
    "    loss_pca = criterion(outputs_pca, y_train_tensor)\n",
    "    loss_pca.backward()\n",
    "    optimizer_pca.step()\n",
    "\n",
    "# Evaluate the MLP model on PCA-transformed data\n",
    "mlp_model_pca.eval()\n",
    "with torch.no_grad():\n",
    "    y_pred_mlp_pca_tensor = mlp_model_pca(X_test_pca_tensor)\n",
    "    mse_mlp_pca = criterion(y_pred_mlp_pca_tensor, y_test_tensor).item()\n",
    "print(f'MLP Regressor MSE with PCA: {mse_mlp_pca}')\n",
    "\n",
    "# Initialize and train the Random Forest model on PCA-transformed data\n",
    "rf_model_pca = RandomForest(n_estimators=100, max_depth=10)\n",
    "rf_model_pca.fit(X_train_pca, y_train.values)\n",
    "\n",
    "# Evaluate the Random Forest model on PCA-transformed data\n",
    "y_pred_rf_pca = rf_model_pca.predict(X_test_pca)\n",
    "mse_rf_pca = mean_squared_error(y_test, y_pred_rf_pca)\n",
    "print(f'Random Forest Regressor MSE with PCA: {mse_rf_pca}')\n",
    "\n",
    "# Evaluate the Random Forest model\n",
    "y_pred_rf = rf_model.predict(X_test.values)\n",
    "mse_rf = mean_squared_error(y_test, y_pred_rf)\n",
    "print(f'Pure Random Forest Regressor MSE: {mse_rf}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pure Decision Tree Analysis for Prediction\n",
    "# Initialize the Decision Tree model\n",
    "dt_regressor = DecisionTreeRegressor(random_state=42)\n",
    "\n",
    "# Train the Decision Tree model\n",
    "dt_regressor.fit(X_train, y_train)\n",
    "\n",
    "# Predict on the test set\n",
    "y_pred_dt = dt_regressor.predict(X_test)\n",
    "\n",
    "# Calculate Mean Squared Error\n",
    "mse_dt = mean_squared_error(y_test, y_pred_dt)\n",
    "print(f'Mean Squared Error of the pure Decision Tree regression model: {mse_dt}')\n",
    "\n",
    "# Version with PCA followed by Random Forest\n",
    "# Perform PCA dimensionality reduction on the dataset\n",
    "pca = PCA(n_components=0.95)  # Retain 95% of variance\n",
    "X_train_pca = pca.fit_transform(X_train)\n",
    "X_test_pca = pca.transform(X_test)\n",
    "\n",
    "# Initialize the Random Forest model using individual Decision Trees\n",
    "class RandomForest:\n",
    "    def __init__(self, n_estimators=100, max_depth=None):\n",
    "        self.n_estimators = n_estimators\n",
    "        self.max_depth = max_depth\n",
    "        self.trees = [DecisionTreeRegressor(max_depth=self.max_depth) for _ in range(self.n_estimators)]\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        for tree in self.trees:\n",
    "            bootstrap_indices = np.random.choice(np.arange(X.shape[0]), size=X.shape[0], replace=True)\n",
    "            X_bootstrap = X[bootstrap_indices]\n",
    "            y_bootstrap = y[bootstrap_indices]\n",
    "            tree.fit(X_bootstrap, y_bootstrap)\n",
    "\n",
    "    def predict(self, X):\n",
    "        tree_preds = np.array([tree.predict(X) for tree in self.trees])\n",
    "        return np.mean(tree_preds, axis=0)\n",
    "\n",
    "# Initialize the Random Forest model\n",
    "rf_model_pca = RandomForest(n_estimators=100, max_depth=10)\n",
    "\n",
    "# Train the Random Forest model\n",
    "rf_model_pca.fit(X_train_pca, y_train.values.ravel())  # Ensure y_train is a 1D array\n",
    "\n",
    "# Predict on the test set\n",
    "y_pred_rf_pca = rf_model_pca.predict(X_test_pca)\n",
    "\n",
    "# Calculate Mean Squared Error\n",
    "mse_rf_pca = mean_squared_error(y_test, y_pred_rf_pca)\n",
    "# Initialize and train the pure Decision Tree model on PCA-transformed data\n",
    "dt_regressor_pca = DecisionTreeRegressor(random_state=42)\n",
    "dt_regressor_pca.fit(X_train_pca, y_train.values.ravel())\n",
    "\n",
    "# Predict on the test set\n",
    "y_pred_dt_pca = dt_regressor_pca.predict(X_test_pca)\n",
    "\n",
    "# Calculate Mean Squared Error\n",
    "mse_dt_pca = mean_squared_error(y_test, y_pred_dt_pca)\n",
    "print(f'Mean Squared Error of the pure Decision Tree regression model after PCA: {mse_dt_pca}')\n",
    "print(f'Mean Squared Error of the Random Forest regression model after PCA: {mse_rf_pca}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch [100/1000], Loss: 2162355456.0000\n",
      "Epoch [200/1000], Loss: 1006230784.0000\n",
      "Epoch [300/1000], Loss: 979214528.0000\n",
      "Epoch [400/1000], Loss: 960419264.0000\n",
      "Epoch [500/1000], Loss: 939120064.0000\n",
      "Epoch [600/1000], Loss: 913023104.0000\n",
      "Epoch [700/1000], Loss: 878947968.0000\n",
      "Epoch [800/1000], Loss: 830806400.0000\n",
      "Epoch [900/1000], Loss: 758280384.0000\n",
      "Epoch [1000/1000], Loss: 650082304.0000\n",
      "Gradual MLP Regressor MSE: 629248064.0\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "\n",
    "# 定义MLP模型\n",
    "class GradualMLP(nn.Module):\n",
    "    def __init__(self, input_size):\n",
    "        super(GradualMLP, self).__init__()\n",
    "        self.fc1 = nn.Linear(input_size, 128)\n",
    "        self.fc2 = nn.Linear(128, 64)\n",
    "        self.fc3 = nn.Linear(64, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = torch.relu(self.fc1(x))\n",
    "        x = torch.relu(self.fc2(x))\n",
    "        x = self.fc3(x)\n",
    "        return x\n",
    "\n",
    "# 转换数据为PyTorch张量\n",
    "X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)\n",
    "y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)\n",
    "X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)\n",
    "y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)\n",
    "\n",
    "# 初始化MLP模型、损失函数和优化器\n",
    "gradual_mlp_model = GradualMLP(X_train.shape[1])\n",
    "criterion = nn.MSELoss()\n",
    "optimizer = optim.Adam(gradual_mlp_model.parameters(), lr=0.001)\n",
    "\n",
    "# 训练MLP模型\n",
    "num_epochs = 1000\n",
    "for epoch in range(num_epochs):\n",
    "    gradual_mlp_model.train()\n",
    "    optimizer.zero_grad()\n",
    "    outputs = gradual_mlp_model(X_train_tensor)\n",
    "    loss = criterion(outputs, y_train_tensor)\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "\n",
    "    # 每100个epoch进行一次强化训练\n",
    "    if (epoch + 1) % 100 == 0:\n",
    "        with torch.no_grad():\n",
    "            y_pred_train = gradual_mlp_model(X_train_tensor)\n",
    "            train_loss = criterion(y_pred_train, y_train_tensor).item()\n",
    "            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {train_loss:.4f}')\n",
    "\n",
    "# 评估MLP模型\n",
    "gradual_mlp_model.eval()\n",
    "with torch.no_grad():\n",
    "    y_pred_gradual_mlp = gradual_mlp_model(X_test_tensor)\n",
    "    mse_gradual_mlp = criterion(y_pred_gradual_mlp, y_test_tensor).item()\n",
    "print(f'Gradual MLP Regressor MSE: {mse_gradual_mlp}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply PCA to the dataset to reduce dimensions from 79 to 54\n",
    "pca_54 = PCA(n_components=54)\n",
    "X_train_pca_54 = pca_54.fit_transform(X_train)\n",
    "X_test_pca_54 = pca_54.transform(X_test)\n",
    "\n",
    "# Convert PCA-transformed data to PyTorch tensors\n",
    "X_train_pca_54_tensor = torch.tensor(X_train_pca_54, dtype=torch.float32)\n",
    "X_test_pca_54_tensor = torch.tensor(X_test_pca_54, dtype=torch.float32)\n",
    "\n",
    "# Initialize and train the MLP model on PCA-transformed data\n",
    "mlp_model_pca_54 = MLP(X_train_pca_54.shape[1])\n",
    "optimizer_pca_54 = optim.Adam(mlp_model_pca_54.parameters(), lr=0.001)\n",
    "\n",
    "for epoch in range(num_epochs):\n",
    "    mlp_model_pca_54.train()\n",
    "    optimizer_pca_54.zero_grad()\n",
    "    outputs_pca_54 = mlp_model_pca_54(X_train_pca_54_tensor)\n",
    "    loss_pca_54 = criterion(outputs_pca_54, y_train_tensor)\n",
    "    loss_pca_54.backward()\n",
    "    optimizer_pca_54.step()\n",
    "\n",
    "# Evaluate the MLP model on PCA-transformed data\n",
    "mlp_model_pca_54.eval()\n",
    "with torch.no_grad():\n",
    "    y_pred_mlp_pca_54_tensor = mlp_model_pca_54(X_test_pca_54_tensor)\n",
    "    mse_mlp_pca_54 = criterion(y_pred_mlp_pca_54_tensor, y_test_tensor).item()\n",
    "print(f'MLP Regressor MSE with PCA (54 components): {mse_mlp_pca_54}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch [100/1000], Loss: 2004777216.0000\n",
      "Epoch [200/1000], Loss: 993264512.0000\n",
      "Epoch [300/1000], Loss: 973791296.0000\n",
      "Epoch [400/1000], Loss: 953398208.0000\n",
      "Epoch [500/1000], Loss: 929501888.0000\n",
      "Epoch [600/1000], Loss: 898565056.0000\n",
      "Epoch [700/1000], Loss: 854019136.0000\n",
      "Epoch [800/1000], Loss: 783311232.0000\n",
      "Epoch [900/1000], Loss: 668801024.0000\n",
      "Epoch [1000/1000], Loss: 512728480.0000\n",
      "Gradual MLP Regressor MSE: 494984896.0\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "\n",
    "# Define the GradualMLP model\n",
    "class GradualMLP(nn.Module):\n",
    "    def __init__(self, input_size):\n",
    "        super(GradualMLP, self).__init__()\n",
    "        self.fc1 = nn.Linear(input_size, 128)\n",
    "        self.fc2 = nn.Linear(128, 64)\n",
    "        self.fc3 = nn.Linear(64, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = torch.relu(self.fc1(x))\n",
    "        x = torch.relu(self.fc2(x))\n",
    "        x = self.fc3(x)\n",
    "        return x\n",
    "\n",
    "# Convert data to PyTorch tensors\n",
    "X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)\n",
    "y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)\n",
    "X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)\n",
    "y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)\n",
    "\n",
    "# Initialize the GradualMLP model, loss function, and optimizer\n",
    "gradual_mlp_model = GradualMLP(X_train.shape[1])\n",
    "criterion = nn.MSELoss()\n",
    "optimizer = optim.Adam(gradual_mlp_model.parameters(), lr=0.001)\n",
    "\n",
    "# Train the GradualMLP model\n",
    "num_epochs = 1000\n",
    "for epoch in range(num_epochs):\n",
    "    gradual_mlp_model.train()\n",
    "    optimizer.zero_grad()\n",
    "    outputs = gradual_mlp_model(X_train_tensor)\n",
    "    loss = criterion(outputs, y_train_tensor)\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "\n",
    "    # Reinforce training every 100 epochs\n",
    "    if (epoch + 1) % 100 == 0:\n",
    "        with torch.no_grad():\n",
    "            y_pred_train = gradual_mlp_model(X_train_tensor)\n",
    "            train_loss = criterion(y_pred_train, y_train_tensor).item()\n",
    "            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {train_loss:.4f}')\n",
    "\n",
    "# Evaluate the GradualMLP model\n",
    "gradual_mlp_model.eval()\n",
    "with torch.no_grad():\n",
    "    y_pred_gradual_mlp = gradual_mlp_model(X_test_tensor)\n",
    "    mse_gradual_mlp = criterion(y_pred_gradual_mlp, y_test_tensor).item()\n",
    "print(f'Gradual MLP Regressor MSE: {mse_gradual_mlp}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cisc7021",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
